#importing seaborn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
"""
--relplot() --scatterplot() --lineplot()
--catplot() --boxplot() --stripplot() --swarmplot() --etc...
--distplot() --kdeplot() --jointplot() --rugplot()
--regplot() --lmplot()
--figure styling
--axes styling
--color styling
--etc ....
"""
# Numerical data plots covered in this part:
#   - relplot()
#   - scatterplot()
#   - lineplot()
sns.set(style="darkgrid")
tips = sns.load_dataset("tips")
tips.head()

# relplot(): relational plot between two variables.
sns.relplot(data=tips, x="total_bill", y="tip")
dir(sns.FacetGrid)  # attributes and methods available on FacetGrid

# hue: colour the points by a categorical variable.
sns.relplot(data=tips, x="total_bill", y="tip", hue="smoker")

# style: use a different marker per category.
sns.relplot(data=tips, x="total_bill", y="tip", hue="smoker", style="time")

# palette: a cubehelix spec string lightens or darkens the colours.
sns.relplot(
    data=tips,
    x="total_bill",
    y="tip",
    hue="smoker",
    style="time",
    palette="ch:r=-0.9,l=0.75",
)

# size: scale each marker by the value of a variable.
sns.relplot(data=tips, x="total_bill", y="tip", hue="sex", style="day", size="size")

# sizes: (min, max) range used for the marker sizes.
sns.relplot(
    data=tips,
    x="total_bill",
    y="tip",
    hue="sex",
    style="time",
    size="size",
    sizes=(15, 200),
    palette="ch:r=-0.9,l=0.75",
)
from numpy.random import randn

# Random walk against an integer time index (cumulative sum of normal draws).
df = pd.DataFrame({"time": np.arange(500), "value": randn(500).cumsum()})
df.head()
sns.relplot(data=df, x="time", y="value", kind="line")
sns.relplot(data=df, x="time", y="value", kind="scatter", hue="value")

# Two-dimensional random walk: both columns are cumulative sums,
# so "time" is no longer monotonic.
df = pd.DataFrame(randn(500, 2).cumsum(axis=0), columns=["time", "value"])
df.head()
# sort=False keeps the rows in their original order; sort=True orders them by x.
sns.relplot(data=df, x="time", y="value", kind="line", sort=False)
sns.relplot(data=df, x="time", y="value", kind="line", sort=True)
# Repeated-measures data: fmri has multiple signal values for the same timepoint.
fmri = sns.load_dataset("fmri")
fmri.head()

# Default line plot: ci is left at its default, so a confidence band is drawn.
sns.relplot(data=fmri, x="timepoint", y="signal", kind="line")
# ci=False: intended to switch the confidence band off.
sns.relplot(data=fmri, x="timepoint", y="signal", kind="line", ci=False)
# ci="sd": band shows the standard deviation instead of a confidence interval.
sns.relplot(data=fmri, x="timepoint", y="signal", kind="line", ci="sd")
# estimator=None: no aggregation, hence no confidence band either.
sns.relplot(data=fmri, x="timepoint", y="signal", kind="line", estimator=None)

# Standard-deviation band, one line per event.
sns.relplot(data=fmri, x="timepoint", y="signal", kind="line", hue="event", ci="sd")

# hue and style can distinguish two different variables, or the same one twice.
sns.relplot(
    data=fmri,
    x="timepoint",
    y="signal",
    kind="line",
    hue="region",
    style="event",
    markers=True,
    dashes=False,
)
sns.relplot(
    data=fmri,
    x="timepoint",
    y="signal",
    kind="line",
    hue="region",
    style="region",
    markers=True,
    dashes=False,
)
sns.relplot(
    data=fmri,
    x="timepoint",
    y="signal",
    kind="line",
    hue="region",
    style="event",
    markers=True,
    dashes=True,
)
# Same variable differentiated by both colour and line style.
sns.relplot(data=fmri, x="timepoint", y="signal", kind="line", hue="event", style="event")

# units="subject" with estimator=None: one line per subject.
sns.relplot(
    data=fmri,
    x="timepoint",
    y="signal",
    kind="line",
    hue="event",
    units="subject",
    estimator=None,
)
# query() restricts the plot to the rows of interest (event == 'stim').
sns.relplot(
    data=fmri.query("event=='stim'"),
    x="timepoint",
    y="signal",
    kind="line",
    hue="region",
    units="subject",
    estimator=None,
)
# Keep only the rows of the dots dataset whose align column equals 'dots'.
dots = sns.load_dataset("dots").query("align=='dots'")
dots.head()
sns.relplot(
    data=dots, x="time", y="firing_rate", kind="line", hue="coherence", style="choice"
)

# Custom six-colour cubehelix palette, one colour per coherence level.
palette = sns.cubehelix_palette(n_colors=6, light=0.5)
sns.relplot(
    data=dots,
    x="time",
    y="firing_rate",
    kind="line",
    hue="coherence",
    style="choice",
    palette=palette,
)

# Coherence additionally controls the line width, within the range (1, 5).
sns.relplot(
    data=dots,
    x="time",
    y="firing_rate",
    kind="line",
    hue="coherence",
    size="coherence",
    style="choice",
    sizes=(1, 5),
)
# Random walk indexed by 500 consecutive calendar days.
df = pd.DataFrame(
    {
        "time": pd.date_range("2019-06-02", periods=500),
        "value": randn(500).cumsum(),
    }
)
df.head()

# First attempt: the date tick labels are not clearly readable.
g = sns.relplot(data=df, x="time", y="value", kind="line")

# Same plot again, then let matplotlib rotate and align the date labels.
g = sns.relplot(data=df, x="time", y="value", kind="line")
g.fig.autofmt_xdate()
# FacetGrid lets one call draw several sub-plots. Back to the tips dataset.
tips.head()

# Single combined plot, smokers and non-smokers separated only by colour.
sns.relplot(data=tips, x="total_bill", y="tip", hue="smoker")

# col= splits the combined plot into one panel per category.
sns.relplot(data=tips, x="total_bill", y="tip", hue="smoker", col="smoker")
sns.relplot(data=tips, x="total_bill", y="tip", hue="smoker", col="time")
sns.relplot(data=tips, x="total_bill", y="tip", hue="smoker", col="size")

# row= stacks the panels vertically instead.
sns.relplot(data=tips, x="total_bill", y="tip", hue="smoker", row="size")

# col and row together: columns by region, rows by event.
sns.relplot(
    data=fmri,
    x="timepoint",
    y="signal",
    kind="line",
    hue="subject",
    col="region",
    row="event",
    height=3,
    ci="sd",
)

# col_wrap=3 wraps the per-size panels after every third column.
sns.relplot(
    data=tips,
    x="total_bill",
    y="tip",
    hue="smoker",
    col="size",
    col_wrap=3,
    height=3,
)
# Axes-level line plot for numerical data.
sns.lineplot(data=tips, x="total_bill", y="tip")
# Axes-level scatter plot.
sns.scatterplot(data=tips, x="total_bill", y="tip")

sns.lineplot(data=fmri, x="timepoint", y="signal")
sns.lineplot(
    data=fmri, x="timepoint", y="signal", hue="event", style="event",
    markers=True, ci="sd",
)
# err_style="bars" draws error bars instead of a band; ci may also be a number.
sns.lineplot(
    data=fmri, x="timepoint", y="signal", hue="event", style="event",
    markers=True, ci="sd", err_style="bars",
)
sns.lineplot(
    data=fmri, x="timepoint", y="signal", hue="event", style="event",
    markers=True, ci=68, err_style="bars",
)
sns.lineplot(
    data=fmri, x="timepoint", y="signal", hue="region", style="event",
    markers=True, ci=68, err_style="bars",
)

# Same figures as the earlier relplot examples, drawn with the axes-level calls.
sns.lineplot(data=dots, x="time", y="firing_rate", hue="coherence", style="choice")
sns.scatterplot(
    data=tips, x="total_bill", y="tip", hue="smoker", size="size", style="time"
)

# Iris dataset: two equivalent ways of passing the data to scatterplot.
iris = sns.load_dataset("iris")
iris.head()
sns.scatterplot(data=iris, x="sepal_length", y="petal_length")  # column names + data
sns.scatterplot(x=iris["sepal_length"], y=iris["petal_length"])  # Series directly
# Categorical data plots:
#   - catplot()
#   - boxplot()
#   - stripplot()
#   - swarmplot()
#   - etc.
# Using the tips dataset.
tips.head()

# Default catplot: jitter is on.
sns.catplot(data=tips, x="day", y="total_bill")
# jitter=False puts every point of a category on a single vertical line.
sns.catplot(data=tips, x="day", y="total_bill", jitter=False)

# kind="swarm" spreads the points so they do not overlap.
sns.catplot(data=tips, x="day", y="total_bill", kind="swarm")
sns.catplot(data=tips, x="day", y="total_bill", kind="swarm", hue="size")
sns.catplot(data=tips, x="day", y="total_bill", kind="swarm", hue="sex")

# order= fixes the order of the categories on the axis.
sns.catplot(data=tips, x="smoker", y="tip", order=["No", "Yes"])
tips.head()

# Box plots for categorical data.
sns.catplot(data=tips, x="day", y="total_bill", kind="box", hue="sex")
sns.catplot(data=tips, x="day", y="total_bill", kind="box", hue="time")
# dodge=False draws the hue sub-categories at the same position
# instead of side by side.
sns.catplot(data=tips, x="day", y="total_bill", kind="box", hue="time", dodge=False)
sns.catplot(data=tips, x="day", y="total_bill", kind="box", hue="sex", dodge=False)

diamonds = sns.load_dataset("diamonds")
diamonds.head()
# kind="boxen" draws additional quantile boxes.
sns.catplot(data=diamonds.sort_values("color"), x="color", y="price", kind="boxen")
sns.catplot(data=tips, x="day", y="total_bill", kind="boxen", dodge=False)

# Violin plots: like a box plot, but showing the shape of the distribution.
# split plays the role dodge played above; inner="stick" draws lines inside
# the violin.
sns.catplot(
    data=tips, x="total_bill", y="day", hue="sex", kind="violin",
    split=False, inner="stick",
)
# Without inner="stick" a small box plot is drawn inside each violin.
sns.catplot(data=tips, x="total_bill", y="day", hue="sex", kind="violin", split=False)
# split=True combines the two hue levels into one violin.
sns.catplot(
    data=tips, x="total_bill", y="day", hue="sex", kind="violin",
    split=True, inner="stick",
)
sns.catplot(
    data=tips, x="total_bill", y="day", hue="time", kind="violin",
    split=False, inner="stick",
)
sns.catplot(
    data=tips, x="total_bill", y="day", hue="time", kind="violin",
    split=True, inner="stick",
)

# Swarm and violin together: draw empty violins, then overlay the swarm
# on the same axes.
g = sns.catplot(data=tips, x="day", y="total_bill", kind="violin", inner=None)
sns.swarmplot(data=tips, x="day", y="total_bill", color="k", size=3, ax=g.ax)

titanic = sns.load_dataset("titanic")
titanic.head()
# Bar plot: the line on top of each bar shows the uncertainty of the estimate;
# the longer the line, the higher the uncertainty.
sns.catplot(data=titanic, x="sex", y="survived", hue="class", kind="bar")
sns.catplot(data=titanic, x="deck", hue="class", kind="count", palette="ch:0.25")

# Bar versus point representation of the same estimate.
sns.catplot(
    data=titanic, x="sex", y="survived", hue="class", kind="bar", palette="ch:0.25"
)
sns.catplot(
    data=titanic, x="sex", y="survived", hue="class", kind="point", palette="ch:0.25"
)
# Visualising the distribution of the data:
#   - distplot()
#   - kdeplot()
#   - jointplot()
#   - rugplot()
# Univariate distribution (distribution plot).
x = randn(100)

sns.distplot(x)  # histogram plus KDE curve
sns.distplot(x, kde=False)  # histogram only
sns.distplot(x, kde=False, rug=True)  # rug adds a tick per observation at the bottom
sns.distplot(x, kde=False, rug=True, bins=30)  # same, with 30 bins
sns.distplot(x, kde=True, hist=False, rug=False, bins=30)  # KDE curve only

sns.kdeplot(x, shade=True)
sns.kdeplot(x, shade=True, cbar=True, kernel="gau", bw="scott", cumulative=True)
# bw is the bandwidth: a small value follows the data closely ...
sns.kdeplot(x, shade=True, cbar=True, bw=0.2, cut=10)
# ... while a large one gives a smooth curve.
sns.kdeplot(x, shade=True, cbar=True, bw=1, cut=0)
# Bivariate distributions.
tips.head()
x = tips["total_bill"]
y = tips["tip"]
sns.set()

# Hexbin joint plot. The former annot_kws=dict(stat='r') argument was dropped:
# no stat_func is passed, so it had nothing to annotate.
# NOTE(review): `rug` inside marginal_kws presumably relies on distplot-based
# marginals of older seaborn releases - confirm against the installed version.
sns.jointplot(
    x="total_bill",
    y=y,
    data=tips,
    height=6,
    space=0.2,
    kind="hex",
    color="r",
    marginal_kws=dict(bins=15, rug=True),
    edgecolor="w",
    linewidth=1,
)
sns.jointplot(x=x, y=y, data=tips)

# axes_style() only returns the style; it must be used as a context manager
# for the 'white' style to apply to the plot drawn inside it.
with sns.axes_style("white"):
    sns.jointplot(x=x, y=y, kind="hex")
with sns.axes_style("white"):
    sns.jointplot(x=x, y=y, kind="kde")
sns.set()

# Filled bivariate KDE on its own axes, using a reversed cubehelix colormap.
f, ax = plt.subplots(figsize=(6, 6))
cmap = sns.cubehelix_palette(as_cmap=True, dark=0, light=1, reverse=True)
# NOTE(review): passing both series positionally is kept as in the original;
# the accepted calling form of kdeplot appears to differ between seaborn
# releases - confirm against the installed version.
sns.kdeplot(x, y, cmap=cmap, n_levels=60, shade=True)

# KDE joint plot, first plain, then with the raw observations overlaid.
# x and y are passed by keyword, consistent with the jointplot calls above.
g = sns.jointplot(x=x, y=y, kind="kde", color="m")
g = sns.jointplot(x=x, y=y, kind="kde", color="m")
g.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
g.ax_joint.collections[0].set_alpha(0)
# Multivariate: pairplot draws every pairwise relationship in one grid.
# Off-diagonal cells are scatter plots, diagonal cells are univariate plots.
sns.pairplot(iris)

# PairGrid gives control over what goes where: here KDE curves on the diagonal
# and 10-level KDE contours everywhere else.
g = sns.PairGrid(iris)
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot, n_levels=10)
# Linear regression and relationship plots.
sns.regplot(data=tips, x="total_bill", y="tip")
# Figure-level equivalent.
sns.lmplot(data=tips, x="total_bill", y="tip")

# x_jitter spreads the discrete x values slightly.
sns.lmplot(data=tips, x="size", y="tip", x_jitter=0.05)
# x_estimator collapses each x value to a single point: the mean ...
sns.lmplot(data=tips, x="size", y="tip", x_estimator=np.mean)
# ... or the median.
sns.lmplot(data=tips, x="size", y="tip", x_estimator=np.median)

data = sns.load_dataset("anscombe")
data.head()
sns.lmplot(data=data, x="x", y="y")

# Dataset I with ci=100, then with the confidence interval switched off.
sns.lmplot(data=data.query("dataset=='I'"), x="x", y="y", ci=100, scatter_kws={"s": 80})
sns.lmplot(data=data.query("dataset=='I'"), x="x", y="y", ci=None, scatter_kws={"s": 80})

# Dataset II is not linear; order=2 fits a polynomial of degree 2 instead.
sns.lmplot(
    data=data.query("dataset=='II'"), x="x", y="y",
    order=2, ci=None, scatter_kws={"s": 80},
)

# Dataset III contains an outlier that shifts the best-fit line.
sns.lmplot(data=data.query("dataset=='III'"), x="x", y="y", ci=None, scatter_kws={"s": 80})

# Dataset II again, with a degree-3 polynomial.
sns.lmplot(data=data.query("dataset=='II'"), x="x", y="y", order=3)

# robust=True down-weights the outlier so the line follows the linear trend.
sns.lmplot(
    data=data.query("dataset=='III'"), x="x", y="y",
    robust=True, ci=None, scatter_kws={"s": 80},
)

# Regression per facet: colour by sex, columns by time, rows by smoker.
sns.lmplot(
    data=tips, x="total_bill", y="tip",
    hue="sex", markers=["o", "x"], col="time", row="smoker",
)

# regplot draws onto an existing axes, here an 8x4 figure.
f, ax = plt.subplots(figsize=(8, 4))
sns.regplot(data=tips, x="total_bill", y="tip", ax=ax)

# lmplot sizes its own grid through col_wrap and height.
sns.lmplot(data=tips, x="total_bill", y="tip", col="day", col_wrap=2, height=4)
#CONTROLLING PLOTTED FIGURES AESTHETICS :
#--figure styling
#--axes styling
#--color styling
#--etc ....
def sinplot(flip=1):
    """Draw six offset sine curves on the current matplotlib axes.

    Curve ``i`` (1..6) is ``sin(x + i * 0.5) * (7 - i) * flip`` sampled at
    100 evenly spaced points of ``x`` in [0, 14], so the phase shifts and the
    amplitude shrinks as ``i`` grows.

    Parameters
    ----------
    flip : number, default 1
        Multiplier applied to every curve; ``-1`` mirrors the curves
        about the x axis.
    """
    x = np.linspace(0, 14, 100)
    for i in range(1, 7):
        plt.plot(x, np.sin(x + i * 0.5) * (7 - i) * flip)
# Baseline: flipped curves with the style currently in effect.
sinplot(-1)

# The built-in styles, one after another.
sns.set_style("whitegrid")
sinplot()

sns.set_style("dark")
sinplot()

sns.set_style("white")
sinplot()

sns.set_style("ticks")
sinplot()

# despine() removes axis spines; arguments select which ones.
sns.set_style("ticks")
sinplot()
sns.despine()

sns.set_style("ticks")
sinplot()
sns.despine(left=True)

sns.set_style("ticks")
sinplot()
sns.despine(left=True, bottom=True)

# axes_style() with no arguments returns the current style parameters.
sns.axes_style()

# Individual style parameters can be overridden through a dict.
sns.set_style("ticks", {"axes.grid": True, "xtick.direction": "in"})
sinplot()
sns.despine(left=True, bottom=True)

sns.set_style("ticks", {"axes.grid": True, "xtick.direction": "in"})
sinplot()
sns.despine(left=True, bottom=False)

sns.set_style("darkgrid")
sinplot()

# Plotting contexts scale the plot elements: poster, paper, talk.
sns.set_style("darkgrid")
sns.set_context("poster")
sinplot()

sns.set_style("darkgrid")
sns.set_context("paper")
sinplot()

sns.set_style("darkgrid")
sns.set_context("talk", font_scale=1.5)
sinplot()

# Colour palettes: show the current one (twice, as in the notebook),
# then an 8-colour 'hls' palette.
current_palettes = sns.color_palette()
sns.palplot(current_palettes)

current_palettes = sns.color_palette()
sns.palplot(current_palettes)

sns.palplot(sns.color_palette("hls", 8))
# Summary:
#   distplot  - univariate analysis
#   jointplot - bivariate analysis
#   pairplot  - multivariate analysis
import seaborn as sns

df = sns.load_dataset("tips")
df.head()

# Correlation is only defined for the int/float columns. Select them
# explicitly, because calling corr() on the whole frame fails on pandas
# versions that no longer drop non-numeric columns silently.
numeric_corr = df.select_dtypes(include="number").corr()
numeric_corr
sns.heatmap(numeric_corr, annot=True)

# Bivariate analysis.
sns.jointplot(x="tip", y="total_bill", data=df, kind="hex")
sns.jointplot(x="tip", y="total_bill", data=df, kind="reg")

# Pair plots.
sns.pairplot(df)
sns.pairplot(df, hue="sex", height=5)

# Distribution plots.
# kde defaults to True, so the y axis shows a density.
sns.distplot(df["tip"])
# With kde=False the y axis shows counts, i.e. a plain 10-bin histogram.
sns.distplot(df["tip"], kde=False, bins=10)
# Categorical plots:
#   - boxplot
#   - violinplot
#   - countplot
#   - barplot

# Count plot: only one of x or y is given; the other axis is the count.
# The column is passed by keyword so it cannot collide with `data`.
sns.countplot(x="day", data=df)
sns.countplot(y="day", data=df)

# Bar plot, horizontal and vertical.
sns.barplot(x="total_bill", y="sex", data=df)
sns.barplot(x="sex", y="total_bill", data=df)

# Box plot.
sns.boxplot(x="sex", y="total_bill", data=df)
sns.boxplot(x="day", y="total_bill", data=df, palette="rainbow")
# Wide-form call: one vertical box per column of the frame.
# 'r' is not a valid orientation, so 'v' is used.
sns.boxplot(data=df, orient="v")
sns.boxplot(x="total_bill", y="day", hue="smoker", data=df)

# Violin plot: shows the kernel density estimate of the data
# in a box-plot-like layout.
sns.violinplot(x="total_bill", y="day", data=df, palette="rainbow")
#THANKYOU BY RAMA VEERA ISAIAH.